// PURPOSE: Create first-time ninth-grade cohorts who we observe for at least
// three additional years after the first ninth grade.

// Start from an empty session
clear all

// PATH MACROS
// Input: folder the source .dta is read from; Output: folder the result is saved to
global Input  "Y:/limited/Michigan_CTE/funding_change/data_derived/imported"
global Output "Y:/limited/Michigan_CTE/funding_change/data_derived"

// READ k12_student.dta FROM THE INPUT FOLDER
use "${Input}/k12_student.dta"

// STANDARDIZE VARIABLE NAMES
// Grouped rename: old names on the first line map position-by-position to the
// new names on the second line.
rename (ric     grade_fnl bcode_weight povertyflag speddummy frac_attend censusblockgroup1) ///
       (student grade     school       econdis     sped      attendance  census_block)

// CLEAN DATA
// demographics_imp = 1 when any of the six demographic indicators is
// system-missing (.); each such missing value is then set to 0.
gen demographics_imp = 0
foreach d of varlist white black hisp female econdis sped {
  replace demographics_imp = 1 if `d' == .
  replace `d' = 0 if `d' == .
}

// Another race: none of white, black, or hisp is flagged
gen another_race = !(white | black | hisp)

// Male: complement of the female indicator
gen male = 1 - female

// School race composition: mean of each race indicator within year x school
foreach race in white black hisp another_race {
  bysort year school: gegen school_`race' = mean(`race')
}

// Limited English proficiency: map the string codes "N"/"Y" to "0"/"1"
// (other values are left untouched; the variable stays a string here)
replace lep = cond(lep == "N", "0", "1") if inlist(lep, "N", "Y")

// Eighth-grade attendance
// Within student x grade (ordered by year), only the last grade-8 row carries
// its attendance value; every other row is missing. The student-level max then
// spreads that single value to all of the student's rows.
bysort student grade (year): gen attendance_g8_ = cond(grade == 8 & _n == _N, attendance, .)
bysort student (grade year): gegen attendance_g8 = max(attendance_g8_)

// Year of first ninth grade
// cohort holds the year on grade-9 rows only; cohort9 is the earliest such year
// per student (missing if the student has no grade-9 row).
gen cohort = cond(grade == 9, year, .)
bysort student: gegen cohort9 = min(cohort)

// Identify the three years after first ninth grade
// cohort9_k equals 1 on every row of a student who has a record in year
// cohort9 + k, and is missing otherwise.
forvalues k = 1/3 {
  gen cohort9_`k'_ = cond(year == cohort9 + `k', 1, .)
  bysort student (year): gegen cohort9_`k' = max(cohort9_`k'_)
}

// Generate Economic Disadvantage
// A student is treated as economically disadvantaged if flagged in any year
// from the first ninth grade onward. The restriction to those years is built
// as an explicit helper variable: an `if` qualifier is not part of an
// expression, so it should not be placed inside max(). Rows before the first
// ninth grade (and students with missing cohort9) contribute missing values,
// which max() ignores.
gen econdis_post9 = econdis if year >= cohort9
bys student: gegen poor_temp = max(econdis_post9)
replace econdis = poor_temp
drop econdis_post9

// Sample restriction, applied in three steps:
//   (1) first ninth grade falls in 2007-2016;
//   (2) keep only rows from the first ninth-grade year (grade 9, year == cohort9);
//   (3) the student has a record in each of the three following years.
keep if inrange(cohort9, 2007, 2016)
keep if cohort == cohort9
keep if cohort9_1 == 1 & cohort9_2 == 1 & cohort9_3 == 1

// Expected graduation year: three years after the first ninth grade
gen grad_year = cohort9 + 3

// Grade 8 attendance imputation
// Compute the cohort mean from observed values, flag the rows that lack a
// value, then fill the flagged rows with the cohort mean.
bysort cohort9: gegen attendance_g8_cohort = mean(attendance_g8)
gen attendance_g8_imp = missing(attendance_g8)
replace attendance_g8 = attendance_g8_cohort if attendance_g8_imp


// DESTRING VARIABLES
// lep was recoded to the strings "0"/"1" earlier; convert it to numeric
destring lep, replace

// KEEP & ORDER VARIABLES
// One list drives both steps: it is the set of variables retained and also
// their column order in the saved file.
local final_vars student cohort9 school female male black hisp white another_race ///
  econdis sped lep attendance_g8 attendance_g8_imp demographics_imp grad_year ///
  school_black school_hisp school_white school_another_race census_block
keep `final_vars'
order `final_vars'

// FORMAT VARIABLES
// Show the full student ID without scientific notation; show shares and
// attendance with three decimals.
format student %16.0f
format school_black school_hisp school_white school_another_race attendance_g8 %6.3f

// VARIABLE LABELS
// One label per retained variable. demographics_imp flags records where at
// least one demographic indicator was missing and set to 0; attendance_g8_imp
// flags records whose Grade 8 attendance was filled with the cohort mean.
label var student "Student ID"
label var cohort9 "Ninth-grade cohort"
label var school "School"
label var female "Female"
label var male "Male"
label var black "Black"
label var hisp "Hispanic"
label var white "White"
label var another_race "Another race"
label var econdis "Ever economically disadvantaged"
label var sped "Special education"
label var lep "Limited English proficiency"
label var attendance_g8 "Grade 8 attendance"
label var attendance_g8_imp "Imputed Grade 8 attendance"
label var demographics_imp "Imputed demographic characteristic"
label var grad_year "Expected exit year"
label var school_black "School proportion Black"
label var school_hisp "School proportion Hispanic"
label var school_white "School proportion White"
label var school_another_race "School proportion another race"
label var census_block "Census block"

// VALUE LABELS

// School-year labels: year Y is shown as "(Y-1)-(last two digits of Y)",
// e.g. 2007 -> "2006-07". cohort9 covers 2007-2016; grad_year covers 2010-2019.
local cohort_map
local grad_map
forvalues y = 2007/2019 {
  local start  = `y' - 1
  local finish = substr("`y'", 3, 2)
  if inrange(`y', 2007, 2016) {
    local cohort_map `cohort_map' `y' "`start'-`finish'"
  }
  if inrange(`y', 2010, 2019) {
    local grad_map `grad_map' `y' "`start'-`finish'"
  }
}
label define cohort9   `cohort_map'
label define grad_year `grad_map'

// Binary indicator labels
label define male         0 "Female" 1 "Male"
label define another_race 0 "Not another race" 1 "Another race"
label define lep          0 "English proficient" 1 "Limited English proficiency"

// Attach each value label to the variable of the same name
foreach v in cohort9 male another_race lep grad_year {
  label values `v' `v'
}

// SORT & SAVE
sort cohort9 student
compress
save "${Output}/g9_cohorts.dta", replace
